Define variables for coding stoves and for filtering out data.

  • Also set some constants for the stove event analysis
  # Master switch for the slow raw-data import step. The filtered SUMsarizer
  # output is cached as an RDS file; only re-import when new data has arrived.
  reimport_data <- FALSE #Set to TRUE if we need to reimport the SUMsarizer data, but this is slower.  Set to FALSE if already imported and no new data has been added.

  # Project constants and filters (HHID_remove, bad_files, stove_codes, and —
  # presumably — cooking_group, date ranges, duration minimums used below;
  # confirm against the parameter file).
  source('../r_files/parameter_file_Nigeria_AfDB.R')
  # Regex of household IDs excluded from analysis (anchored, pipe-separated).
  # print() echoes the value into the rendered report and returns it for assignment.
  Removed_Houses <- print(HHID_remove)
## [1] "^SHO_03$|^SHO_04$|^SHO_08$|^SHO_10$|^MUS_01$|^AKO_01$|^AKO_02$|^AKO_03$|^AKO_05$|^AKO_06$|^AKO_07$|^AKO_08$"
  # Regex of filename fragments whose files are flagged as bad downloads.
  Files_with_these_strings_removed <- print(bad_files)
## [1] "0_0_3_P83601_IndiaDMYminave|0_0_3_P46673_Indiaminaves|AKOKA DOWNLOAD 5_CC/|AKOKA DOWNLOAD|AKOKA DOWNLOAD 5_KE/|AKOKA DOWNLOAD 5_CC/|AKOKA DOWNLOAD 5_LPG/|MUSHIN DOWNLOAD 5_CC/|MUSHIN DOWNLOAD 5|MUSHIN DOWNLOAD 5_KE/|SHOMOLU DOWNLOAD 5|SHOMOLU DOWNLOAD 5_CC/|SHOMOLU DOWNLOAD 5_KE/|SHOMOLU DOWNLOAD 5_LPG/|Akoka fourth download CC/|Akoka fouth download|Akoka fourth download|Akoka fourth download LPG/|Mushin fourth download CC/|Mushin fourth download KE/|Shomolu fourth download|Shomolu fourth download CC/|Shomolu fourth download KE/|Shomolu fourth download LPG/"
  # Lookup table mapping raw stove codes (with logger-label variants) to
  # canonical stove descriptions; joined onto the events below.
  print(stove_codes)
##     stove stove_descriptions
## 1      CC          CleanCook
## 2   CC(B)          CleanCook
## 3      LP                LPG
## 4     LPG                LPG
## 5      KE           Kerosene
## 6  KE1(B)           Kerosene
## 7  KE2(B)           Kerosene
## 8   KE(B)           Kerosene
## 9     AMB            Ambient
## 10 AMB(B)            Ambient
## 11    KE1           Kerosene
## 12    KE1           Kerosene
## 13    KE2           Kerosene
## 14  KE(1)           Kerosene
## 15  KE(2)           Kerosene
  # Helper scripts: package loading, shared functions, data import, plotting,
  # and the filter_sumsarized() used below.
  source('../r_scripts/load.R')
  source("../r_scripts/functions.R")
  source("../r_scripts/load_data.r")
  source("../r_scripts/plots.R")
  source("../r_scripts/filter_sum_data.R")

Import temperature data and metadata files

Tidy

  • add household id information, get metadata from metadata file and from file names, depending on country
#Import thermocouple data time series output from Sumsarizer.
#datetime_placed is currently based on the metadata.  Flags data as good/bad based on the bad_files and HHID_remove variables.  Those are reapplied directly to the cooking events later, and filtered out if bad.
  # isTRUE() replaces `== TRUE`: same result for a plain TRUE/FALSE flag, but
  # safe (falls through to the cached copy) if the flag is NA or malformed.
  if (isTRUE(reimport_data)) {
      #HHID is grabbed from the filename.
      sumsarized_filtered <- filter_sumsarized(sumsarized, metadata, bad_files,
                                               HHID_remove, project_name, stove_codes)
      # Cache the filtered result so later runs can skip the slow import.
      saveRDS(sumsarized_filtered, file = "../r_files/sumsarized_filtered.RDS")
  } else {
      # Load the cached copy produced by a previous run.
      sumsarized_filtered <- readRDS("../r_files/sumsarized_filtered.RDS")
  }
  • Form cooking events from raw sumsarizer output
  • Group distinct cooking events together. If they are within cooking_group minutes of each other.
  • Only need this for data that went through the web-based sumsarizer, where events are not made automatically.
# # start counting up for cooking events.  Add columns for start and stop times of cooking events, and keep the hhid, loggerid, stove type, field group.  Keep only cooking event time points.  Need a case for start (if i = 1), if data point is within 30 minutes of the last one, if data point is greater than 30 minutes from the last one.

# #Initialize cooking_events: a zero-row frame with one correctly-typed column
# #per field, so the rbind() calls in the event-building loop below can append
# #event rows without coercing column types.
 cooking_events <- data.frame(
   start_time              = as.POSIXct(character()),
   end_time                = as.POSIXct(character()),
   group                   = factor(),
   HHID                    = factor(),
   logger_id               = factor(),
   stove                   = factor(),
   logging_duration_days   = as.numeric(),
   datetime_placed         = as.POSIXct(character()),
   datetime_removal        = as.POSIXct(character()),
   file_indices            = as.numeric(),
   filename                = character(),
   fullsumsarizer_filename = character(),
   comments                = character(),
   qc_dates                = character(),
   use_flag                = as.logical()
 )

 #For each SUM data file: keep the cooking-state points, split them into events
 #separated by gaps longer than cooking_group minutes, sanity-check the
 #placement/removal datetimes, and append the events to cooking_events.
 #NOTE(review): rbind inside a loop grows the frame quadratically; fine at this
 #data size, but a list + dplyr::bind_rows() would scale better.
 for (i in unique(sumsarized_filtered$file_indices)) {
   #Grab data from file i, and keep only the entries that are marked as cooking.
   #Duplicate timestamps are dropped so the lag-based gap test is well defined.
   temp <- dplyr::filter(sumsarized_filtered,file_indices == i) %>%
      dplyr::filter(state == TRUE) %>% dplyr::filter(!duplicated(datetime)) 
   qc_temp <- "ok" #Default to ok data quality, gets demoted based on later checks.

   #any(temp$state==TRUE) is redundant after the state filter above; kept as a cheap guard.
   if (nrow(temp) > 1 && any(temp$state == TRUE)) {
      min_time_file = min(temp$datetime)
      max_time_file = max(temp$datetime)
      #Minutes between consecutive cooking points; the leading NA from lag() is dropped.
      time_difference <- as.numeric(difftime(temp$datetime,lag(temp$datetime),units="mins"))
      time_difference <- time_difference[!is.na(time_difference)]
     
      #A gap longer than cooking_group minutes ends one event and starts the next.
      breakstart <- c(0,which((time_difference>cooking_group) == TRUE))+1 #Start of a cooking event
      breakend <- c(which((time_difference>cooking_group) == TRUE),
          if (tail(temp$state,n=1) == TRUE){nrow(temp)}) #End of cooking event. 
        #Tail part is in case the time series ends while still cooking...need to account for that.

      #Organize/check/fix datetimes of sum placements and removals. If there is a start and end time available for the monitoring period, use it to keep the data from the file.  Disregard this if the datetime_removal is NA, since it means we don't have a fixed known end date.  In this case we assume the end of the file is the end of the monitoring period and keep all the data from the given file.  Provide the start and end date in the event-building loop below.
       #All conditions below compare scalars, so the short-circuit || operator
       #is used (was the vectorized |, which happened to work but is not the
       #scalar idiom and would hide length bugs).
       if (max_time_file > end_date_range || min_time_file < start_date_range) {   #datetime_placed is changed to the local file value if it is below the start_date_range, and datetime_placed is too, according to the end_date_range
             datetime_placed_temp = min_time_file
             datetime_removal_temp = max_time_file 
             qc_temp <- "out_of_placement_range" #These get filtered out, assuming it represents poorly time formatted data.
       } else if (is.na(as.POSIXct(temp$datetime_removal[1])) || is.na(as.POSIXct(temp$datetime_placed[1]))){  #If date is NA, use the file's dates.  They have already been midnight-trimmed in the load_data.r function.
             datetime_placed_temp = min_time_file
             datetime_removal_temp = max_time_file
             qc_temp <- "NA_metadata"
       } else if (min_time_file> as.POSIXct(temp$datetime_placed[1])) {  #If placement time is before time of first data point, switch to first data point.
             datetime_placed_temp = min_time_file
             datetime_removal_temp = max_time_file
             qc_temp <- "placement_before_data"
       } else if (as.POSIXct(min(temp$datetime_placed))> as.POSIXct(max(temp$datetime_removal))) {  #If placement is after removal the metadata is mistaken; fall back to the file's own time span.
             datetime_placed_temp = min_time_file
             datetime_removal_temp = max_time_file 
             qc_temp <- "placement_greaterthan_removal"
       } else { #Metadata values exist and are sane; use them.
             datetime_placed_temp = as.POSIXct(min(temp$datetime_placed))
             datetime_removal_temp = as.POSIXct(max(temp$datetime_removal))
             qc_temp <- "metadata_dates_used"
       }      
      
      #Add cooking events to the cooking_events data frame: one row per
      #(breakstart, breakend) pair; file-level fields are repeated per row.
      cooking_events <- rbind(cooking_events,
                  data.frame(start_time= as.POSIXct(temp$datetime[breakstart]),
                  end_time=as.POSIXct(temp$datetime[breakend]), 
                  group=as.factor(temp$group[breakstart]),
                  HHID=as.factor(temp$HHID[breakstart]),
                  logger_id=factor(temp$logger_id[breakstart]),
                  stove=factor(temp$stove[breakstart]),
                  logging_duration_days = as.numeric(difftime(datetime_removal_temp,datetime_placed_temp,units = "days")),
                  datetime_placed = datetime_placed_temp,
                  datetime_removal = datetime_removal_temp,
                  file_indices = temp$file_indices[breakstart], 
                  filename = temp$filename[breakstart],
                  fullsumsarizer_filename = temp$fullsumsarizer_filename[breakstart],
                  comments = temp$comments[1],
                  use_flag=rep(TRUE,length(breakstart)), #was as.logical(rep(1,length(breakstart)[1])): same values, stated directly
                  qc_dates = qc_temp))
    } else{
         #If no cooking events are found, still create an entry so we know data
         #was collected with zero events in that period.  use_flag = FALSE here
         #differentiates these empty-file rows from real (possibly too-short)
         #cooking events, which carry use_flag = TRUE.
       temp <- dplyr::filter(sumsarized_filtered,file_indices == i)  #Re-filter without the state==TRUE restriction so 
       #we can get info about the sample that does not have cooking events.
       cooking_events <- rbind(cooking_events,
                  data.frame(start_time=as.POSIXct(temp$datetime[1]),
                  end_time=as.POSIXct(temp$datetime[1]), 
                  group=as.factor(temp$group[1]),
                  HHID=as.factor(temp$HHID[1]), 
                  logger_id=factor(temp$logger_id[1]),
                  stove=factor(temp$stove[1]), 
                  logging_duration_days = as.numeric(difftime(max(temp$datetime),min(temp$datetime),units = "days")),
                  datetime_placed = min(temp$datetime),
                  datetime_removal = max(temp$datetime),
                  file_indices = temp$file_indices[1], 
                  filename = temp$filename[1],
                  fullsumsarizer_filename = temp$fullsumsarizer_filename[1],
                  comments = temp$comments[1],
                  qc_dates = qc_temp,
                  use_flag=FALSE))
    }
   
 }


#Clean up cooking events. Join on human-readable stove descriptions, add units
#and event duration, then build the qc flag from three checks in sequence
#(later mutates preserve earlier "bad" marks by passing qc through).
 cooking_events <- dplyr::left_join(cooking_events,
                                              dplyr::select(stove_codes,stove,stove_descriptions),by = "stove") %>%
                          dplyr::mutate(units = "degrees Celsius") %>% #Define units as degrees C
                          dplyr::mutate(duration_minutes = as.numeric(difftime(end_time,start_time,units = "mins"))) %>%
                          dplyr::mutate(qc = if_else(grepl(bad_files,fullsumsarizer_filename,ignore.case=TRUE),"bad","ok")) %>% #Flag data from bad files
                          dplyr::mutate(qc = if_else(grepl(HHID_remove,HHID),"bad",qc)) %>% #Flag data from HHID's that should be removed.
                          dplyr::mutate(qc = if_else(grepl("amb",fullsumsarizer_filename,ignore.case=TRUE),"bad",qc))  #Flag ambient-monitoring files (filename contains "amb"); these are not stove-use data.

Remove data from bad files:

  • merge flags with data. Perform filtering on the cooking events variable to get to the subset we want.
  #Build the analysis subset: drop unmapped stoves, too-short deployments, and
  #out-of-range placements; de-duplicate overlapping files; derive time-of-day
  #and calendar grouping columns; keep only qc == "ok" rows.
  ok_cooking_events <-  dplyr::filter(cooking_events,!is.na(stove_descriptions)) %>%
                          dplyr::filter(logging_duration_days >= logging_duration_minimum) %>% # Keep only deployments at least logging_duration_minimum days long.
                          #Filter out events shorter than the threshold, while keeping the entries that 
                          #have no uses in the deployments. At this point all events except empty files have use_flag = TRUE.
                          dplyr::filter(!grepl("out_of_placement_range",qc_dates)) %>% #Keep data not marked as placed out of range
                          #select: `stove` was previously listed twice; the duplicate is removed (dplyr keeps one copy either way).
                          dplyr::select(filename,fullsumsarizer_filename,start_time,end_time,duration_minutes,
                                HHID,stove,logger_id,group,
                                stove_descriptions,logging_duration_days,datetime_placed,
                                datetime_removal, units,comments,use_flag,qc) %>%
                          dplyr::mutate(start_hour = hour(start_time) + minute(start_time)/60) %>% #Fractional hour of day the event started.
                          dplyr::mutate(month_year = format(start_time,"%b-%y")) %>%
                          dplyr::mutate(month_year = factor(month_year, unique(month_year), ordered=TRUE)) %>% #Ordered by first appearance (data is not yet sorted by time here).
                          dplyr::mutate(day_month_year = as.Date(start_time)) %>%
                          dplyr::mutate(week_year = format(start_time,"%V-%y")) %>% #ISO week number - 2-digit year.
                          dplyr::mutate(day_of_week = as.factor(weekdays(start_time))) %>%
                          dplyr::mutate(day_of_week = factor(day_of_week, 
                                     levels = c('Monday','Tuesday','Wednesday','Thursday',
                                                'Friday','Saturday','Sunday'))) %>%
                          dplyr::arrange(desc(datetime_removal)) %>%  #Sort so we toss the earlier end dates when deleting distinct values.
                          dplyr::distinct(start_time,duration_minutes,HHID,stove_descriptions, .keep_all = TRUE) %>% 
                          #Handle cases that have duplicated events due to overlapping files. Keep only distinct ones
                          dplyr::mutate(datetime_placed = floor_date(datetime_placed,"minute")) %>%
                          dplyr::mutate(qc = if_else(logging_duration_days < total_days_logged_minimum,"bad",qc)) %>% # Only keep data from house-stove combos with more than this number of days of data.
                          dplyr::arrange(HHID) %>%
                          dplyr::filter(!is.na(HHID)) %>%
                          dplyr::filter(!is.na(datetime_placed))  %>%
                          dplyr::filter(grepl("ok",qc)) %>% #Keep only data marked 'ok'
                          droplevels() #Drop factor levels whose data was filtered out, so unique() doesn't return stale levels.


#Pad the events: for each source file, add one placeholder row per logged day
#that had no events, so per-day statistics correctly count zero-use days.
ok_cooking_events_padded <- data.frame(stringsAsFactors = FALSE)
#Replace NA with 0.
#dplyr_if_else   <- function(x) { mutate_all(x, funs(ifelse(is.na(.), 0, .))) }
uniquers <- unique(ok_cooking_events$fullsumsarizer_filename)
#BUGFIX: the loop previously ran over 1:length(unique(ok_cooking_events$filename))
#while indexing uniquers, which is built from fullsumsarizer_filename.  If those
#two counts differ, files were silently skipped or uniquers[i] was NA.
for (i in seq_along(uniquers)) {

        temp <- dplyr::filter(ok_cooking_events,uniquers[i]==fullsumsarizer_filename) %>%
                dplyr::arrange(start_time)
        
    if (nrow(temp) > 0) {    
        # generate a time sequence with 1 day intervals spanning the deployment,
        # then keep only the days that have no recorded events
        all.dates <- data.frame(dates = seq(temp$datetime_placed[1], temp$datetime_removal[1], by="day"),stringsAsFactors = FALSE) %>%
                  dplyr::mutate(day_month_year = as.Date(dates)) %>%
                  dplyr::filter(!day_month_year %in% temp$day_month_year) #Get rid of extra days
        
        # Convert all dates to a data frame. Note that we're putting
        # the new dates into a column called "start_time" just like the
        # original column. This will allow us to merge the data.
        all.dates.frame <- data.frame(list(start_time=all.dates$dates),list(end_time=all.dates$dates), 
                                      list(day_month_year = as.Date(all.dates$dates)),
                                      list(week_year = format(all.dates$dates,"%V-%y")),
                                      list(day_of_week = weekdays(all.dates$dates)),stringsAsFactors = FALSE) %>%
                  dplyr::mutate(month_year = format(start_time,"%b-%y")) %>%
                  dplyr::mutate(month_year = factor(month_year, unique(month_year), ordered=TRUE))
                         

        # Merge the two datasets (full outer join: all=TRUE keeps both the real
        # events and the padded days), then fill file-level metadata into the
        # padded rows in both directions.
        merged.data <- merge(all.dates.frame, temp, all=TRUE) %>%
            tidyr::fill(filename,fullsumsarizer_filename,HHID,stove,logger_id,group,
                        stove_descriptions,logging_duration_days,datetime_placed,
                        datetime_removal,units,comments,
                        start_hour,.direction = c("up")) %>%
            tidyr::fill(filename,fullsumsarizer_filename,HHID,stove,logger_id,group,
                        stove_descriptions,logging_duration_days,datetime_placed,
                        datetime_removal,units,comments,
                        start_hour,.direction = c("down"))  %>%
            dplyr::mutate(use_flag = replace(use_flag, is.na(use_flag), FALSE)) 
        
        #Factors to character so the global NA replacement below cannot hit undefined levels.
        merged.data %>% mutate_if(is.factor, as.character) -> merged.data

        #Belt and braces: use_flag NAs were already replaced above, so this is a no-op.
        merged.data$use_flag[is.na(merged.data$use_flag)] <- FALSE
        # The merge set the remaining fields of padded rows to NA; replace them
        # with 0 (character columns receive "0" by coercion).
        merged.data[is.na(merged.data)] <- 0
        #merged.data <-dplyr_if_else(merged.data)

        ok_cooking_events_padded <- rbind(ok_cooking_events_padded,merged.data)
    }
}
    #datetime_removal lost its POSIXct class during padding (rows coerced via
    #the NA->0 replacement); restore it from the numeric epoch value here.
    ok_cooking_events_padded <- dplyr::mutate(ok_cooking_events_padded,datetime_removal = as.POSIXct(datetime_removal,origin = "1970-1-1", tz = "GMT")) %>%
      dplyr::arrange((start_time)) %>%
      dplyr::mutate(month_year = format(start_time,"%b-%y")) %>%
      dplyr::mutate(month_year = factor(month_year, unique(month_year), ordered=TRUE)) %>% #Rebuild month_year ordered by time, now that rows are sorted by start_time.
      dplyr::mutate(qc = "ok") #Padding coerced qc to 0/"0"; every row here already passed qc filtering, so reset qc to "ok".

    #Finish filtering the dataset.  Kept 'bad' and short events to get the full padded dataset.  Now remove
    #events shorter than cooking_duration_minimum, but keep the zero-event placeholder rows (use_flag == FALSE).
    ok_cooking_events <- dplyr::filter(ok_cooking_events,duration_minutes>cooking_duration_minimum | use_flag==FALSE) 

Summarize data

ok_datasummary <- summarise_all(ok_cooking_events,funs(n_distinct(.)))

#Summarize use for each household and stove.  Uses the *padded* table so the
#day counts in the denominators include logged days with zero events.
stats_grouped_by_hhid_stove <- dplyr::group_by(ok_cooking_events_padded,stove_descriptions,HHID,group) %>%
    # calculate events per day for each household and stove.  Can then take summary stats of those.
    # use_flag is TRUE only for real events, so sum(use_flag) counts events and
    # padded rows contribute zero.  minutes_per_event is NaN (0/0) when a
    # household-stove had no events at all.
    dplyr::summarise(events_per_day=sum(use_flag,na.rm=TRUE)/length(unique(day_month_year)),
                     minutes_per_day_stove_hhid = sum(duration_minutes,na.rm=TRUE)/length(unique(day_month_year)),
                     minutes_per_event_stove_hhid = sum(duration_minutes,na.rm=TRUE)/sum(use_flag,na.rm=TRUE),
                     days_logged_per_group_by_stove=sum(length(unique(day_month_year)), na.rm = TRUE),
                     events = n()) 
kable(stats_grouped_by_hhid_stove, digits=2)
stove_descriptions HHID group events_per_day minutes_per_day_stove_hhid minutes_per_event_stove_hhid days_logged_per_group_by_stove events
CleanCook AKO_01B AKO 0.00 0.00 NaN 13 13
CleanCook AKO_02B AKO 1.46 33.85 23.16 13 25
CleanCook AKO_03B AKO 3.38 78.46 23.18 13 44
CleanCook AKO_04 AKO 0.32 8.74 27.67 95 109
CleanCook AKO_05B AKO 0.75 38.75 51.67 8 10
CleanCook AKO_06B AKO 0.00 0.00 NaN 13 13
CleanCook AKO_09 AKO 0.59 26.76 45.50 34 43
CleanCook AKO_10 AKO 1.31 36.77 28.15 62 106
CleanCook MUS_02 MUS 0.33 7.53 22.59 81 95
CleanCook MUS_03 MUS 0.20 2.63 13.33 76 83
CleanCook MUS_04 MUS 0.35 8.02 23.00 86 98
CleanCook MUS_05 MUS 1.41 40.26 28.55 39 68
CleanCook MUS_06 MUS 0.68 57.88 84.79 104 157
CleanCook MUS_07 MUS 1.21 42.42 35.04 95 145
CleanCook MUS_08 MUS 0.75 18.67 25.00 83 120
CleanCook MUS_09 MUS 1.70 45.83 26.89 115 224
CleanCook MUS_10 MUS 1.11 30.57 27.44 70 129
CleanCook SHO_01 SHO 0.94 27.57 29.24 70 99
CleanCook SHO_02 SHO 0.83 24.33 29.20 60 81
CleanCook SHO_05 SHO 0.70 15.06 21.64 79 115
CleanCook SHO_06 SHO 0.45 11.63 26.10 92 108
CleanCook SHO_07 SHO 0.90 29.20 32.44 50 63
CleanCook SHO_09 SHO 0.36 6.43 18.00 70 81
Kerosene AKO_01B AKO 1.46 74.62 51.05 13 20
Kerosene AKO_02B AKO 0.00 0.00 NaN 13 13
Kerosene AKO_03B AKO 1.85 186.92 101.25 13 24
Kerosene AKO_04 AKO 0.90 47.74 52.86 93 119
Kerosene AKO_05B AKO 0.42 20.83 50.00 12 12
Kerosene AKO_06B AKO 1.08 73.08 67.86 13 17
Kerosene AKO_09 AKO 0.74 52.80 71.35 50 72
Kerosene AKO_10 AKO 2.36 176.94 74.94 108 262
Kerosene MUS_02 MUS 3.12 188.31 60.34 124 389
Kerosene MUS_03 MUS 0.75 59.50 79.33 20 28
Kerosene MUS_04 MUS 0.86 60.63 70.44 79 99
Kerosene MUS_05 MUS 2.05 78.64 38.44 66 142
Kerosene MUS_06 MUS 2.24 233.77 104.56 106 257
Kerosene MUS_07 MUS 0.69 35.57 51.67 61 78
Kerosene MUS_08 MUS 2.03 121.39 59.86 72 156
Kerosene MUS_09 MUS 1.28 67.72 52.72 123 185
Kerosene MUS_10 MUS 2.58 180.33 70.00 92 273
Kerosene SHO_01 SHO 1.86 113.33 61.03 84 165
Kerosene SHO_02 SHO 1.76 159.73 90.76 75 141
Kerosene SHO_05 SHO 3.89 274.78 70.61 46 179
Kerosene SHO_06 SHO 0.00 0.00 NaN 84 84
Kerosene SHO_07 SHO 0.12 7.50 60.00 32 32
Kerosene SHO_09 SHO 1.39 75.42 54.27 59 94
LPG AKO_04 AKO 1.14 31.40 27.55 93 136
LPG SHO_01 SHO 0.35 7.82 22.63 55 62
LPG SHO_02 SHO 0.00 0.00 NaN 9 9
LPG SHO_05 SHO 0.63 18.96 30.24 67 80
LPG SHO_06 SHO 1.03 44.05 42.89 74 103
#Calculate summary for the overall groups and stoves from the household-wise results above.  Should this be weighted in case one hh has way more data?
#Each household-stove combo contributes one observation here, so households are
#weighted equally regardless of how many days they logged (hence the question above).
stats_by_stove_group <- dplyr::group_by(stats_grouped_by_hhid_stove,stove_descriptions,group) %>%
    dplyr::summarise(mean_events_per_day=mean(events_per_day, na.rm = TRUE),
                       median_events_per_day=median(events_per_day, na.rm = TRUE),
                       sd_events_per_day=sd(events_per_day, na.rm = TRUE), #NA when a group has a single household-stove.
                       mean_minutes_per_day_stove_hhid=mean(minutes_per_day_stove_hhid, na.rm = TRUE),
                       median_minutes_per_day_stove_hhid=median(minutes_per_day_stove_hhid, na.rm = TRUE),
                       sd_minutes_per_day_stove_hhid=sd(minutes_per_day_stove_hhid, na.rm = TRUE),
                       mean_days_logged=mean(days_logged_per_group_by_stove, na.rm = TRUE),
                       house_stoves=n()) %>% #Number of household-stove combinations contributing to the row.
    dplyr::filter(!is.na(stove_descriptions)) 
kable(stats_by_stove_group, digits=2)
stove_descriptions group mean_events_per_day median_events_per_day sd_events_per_day mean_minutes_per_day_stove_hhid median_minutes_per_day_stove_hhid sd_minutes_per_day_stove_hhid mean_days_logged house_stoves
CleanCook AKO 0.98 0.67 1.11 27.92 30.31 25.95 31.38 8
CleanCook MUS 0.86 0.75 0.53 28.20 30.57 19.76 83.22 9
CleanCook SHO 0.70 0.76 0.24 19.04 19.70 9.31 70.17 6
Kerosene AKO 1.10 0.99 0.77 79.12 62.94 68.23 39.38 8
Kerosene MUS 1.73 2.03 0.87 113.98 78.64 70.43 82.56 9
Kerosene SHO 1.50 1.57 1.42 105.13 94.38 103.24 63.33 6
LPG AKO 1.14 1.14 NA 31.40 31.40 NA 93.00 1
LPG SHO 0.50 0.49 0.44 17.71 13.39 19.21 51.25 4
#Calculate summary for the overall stoves from the household-wise results above.  Should this be weighted in case one hh has way more data?
#Same as above but without the group, just by stove.  Households are again
#equally weighted regardless of days logged.
stats_by_stove <- dplyr::group_by(stats_grouped_by_hhid_stove,stove_descriptions) %>%
    dplyr::summarise(mean_events_per_day=mean(events_per_day, na.rm = TRUE),
                       median_events_per_day=median(events_per_day, na.rm = TRUE),
                       sd_events_per_day=sd(events_per_day, na.rm = TRUE),
                       mean_minutes_per_day_stove_hhid=mean(minutes_per_day_stove_hhid, na.rm = TRUE),
                       median_minutes_per_day_stove_hhid=median(minutes_per_day_stove_hhid, na.rm = TRUE),
                       sd_minutes_per_day_stove_hhid=sd(minutes_per_day_stove_hhid, na.rm = TRUE),
                        house_stoves=n()) %>% #Number of household-stove combinations per stove type.
    dplyr::filter(!is.na(stove_descriptions)) 
kable(stats_by_stove, digits=2)
stove_descriptions mean_events_per_day median_events_per_day sd_events_per_day mean_minutes_per_day_stove_hhid median_minutes_per_day_stove_hhid sd_minutes_per_day_stove_hhid house_stoves
CleanCook 0.86 0.75 0.72 25.71 26.76 19.81 23
Kerosene 1.45 1.39 1.00 99.55 74.62 77.16 23
LPG 0.63 0.63 0.47 20.45 18.96 17.73 5
#Summarize use for each household and stove, by month.  Same metrics as the
#household-stove summary above, with month_year added to the grouping so the
#denominators count logged days within each calendar month.
stats_grouped_by_hhid_stove_month <- dplyr::group_by(ok_cooking_events_padded,stove_descriptions,HHID,group,month_year) %>%
    # calculate events per day for each household and stove.  Can then take summary stats of those.
    # minutes_per_event is NaN (0/0) for months with zero events.
    dplyr::summarise(events_per_day=sum(use_flag,na.rm=TRUE)/length(unique(day_month_year)),
                     minutes_per_day_stove_hhid = sum(duration_minutes,na.rm=TRUE)/length(unique(day_month_year)),
                     minutes_per_event_stove_hhid = sum(duration_minutes,na.rm=TRUE)/sum(use_flag,na.rm=TRUE),
                     days_logged_per_group_by_stove=sum(length(unique(day_month_year)), na.rm = TRUE))
kable(stats_grouped_by_hhid_stove_month, digits=2)
stove_descriptions HHID group month_year events_per_day minutes_per_day_stove_hhid minutes_per_event_stove_hhid days_logged_per_group_by_stove
CleanCook AKO_01B AKO Jan-18 0.00 0.00 NaN 11
CleanCook AKO_01B AKO Feb-18 0.00 0.00 NaN 2
CleanCook AKO_02B AKO Jan-18 1.27 39.09 30.71 11
CleanCook AKO_02B AKO Feb-18 2.50 5.00 2.00 2
CleanCook AKO_03B AKO Jan-18 3.27 77.27 23.61 11
CleanCook AKO_03B AKO Feb-18 4.00 85.00 21.25 2
CleanCook AKO_04 AKO Oct-17 0.63 10.53 16.67 19
CleanCook AKO_04 AKO Nov-17 0.00 0.00 NaN 12
CleanCook AKO_04 AKO Dec-17 0.00 0.00 NaN 22
CleanCook AKO_04 AKO Jan-18 0.00 0.00 NaN 30
CleanCook AKO_04 AKO Feb-18 1.50 52.50 35.00 12
CleanCook AKO_05B AKO Jan-18 0.67 41.67 62.50 6
CleanCook AKO_05B AKO Feb-18 1.00 30.00 30.00 2
CleanCook AKO_06B AKO Jan-18 0.00 0.00 NaN 11
CleanCook AKO_06B AKO Feb-18 0.00 0.00 NaN 2
CleanCook AKO_09 AKO Nov-17 0.00 0.00 NaN 1
CleanCook AKO_09 AKO Dec-17 0.68 36.84 53.85 19
CleanCook AKO_09 AKO Feb-18 0.50 15.00 30.00 14
CleanCook AKO_10 AKO Oct-17 2.56 86.67 33.91 9
CleanCook AKO_10 AKO Nov-17 0.00 0.00 NaN 1
CleanCook AKO_10 AKO Dec-17 1.04 21.92 21.11 26
CleanCook AKO_10 AKO Jan-18 1.42 35.00 24.71 12
CleanCook AKO_10 AKO Feb-18 1.00 36.43 36.43 14
CleanCook MUS_02 MUS Oct-17 1.27 32.73 25.71 11
CleanCook MUS_02 MUS Nov-17 0.62 11.90 19.23 21
CleanCook MUS_02 MUS Dec-17 0.00 0.00 NaN 31
CleanCook MUS_02 MUS Jan-18 0.00 0.00 NaN 18
CleanCook MUS_03 MUS Oct-17 0.70 11.00 15.71 10
CleanCook MUS_03 MUS Nov-17 0.80 9.00 11.25 10
CleanCook MUS_03 MUS Dec-17 0.00 0.00 NaN 19
CleanCook MUS_03 MUS Jan-18 0.00 0.00 NaN 31
CleanCook MUS_03 MUS Feb-18 0.00 0.00 NaN 6
CleanCook MUS_04 MUS Oct-17 0.94 24.38 26.00 16
CleanCook MUS_04 MUS Nov-17 0.94 18.75 20.00 16
CleanCook MUS_04 MUS Dec-17 0.00 0.00 NaN 22
CleanCook MUS_04 MUS Jan-18 0.00 0.00 NaN 30
CleanCook MUS_04 MUS Feb-18 0.00 0.00 NaN 2
CleanCook MUS_05 MUS Oct-17 2.00 45.00 22.50 2
CleanCook MUS_05 MUS Nov-17 1.38 36.25 26.36 24
CleanCook MUS_05 MUS Dec-17 1.38 46.92 33.89 13
CleanCook MUS_06 MUS Oct-17 4.00 361.25 90.31 16
CleanCook MUS_06 MUS Nov-17 0.00 0.00 NaN 28
CleanCook MUS_06 MUS Dec-17 0.37 12.63 34.29 19
CleanCook MUS_06 MUS Jan-18 0.00 0.00 NaN 24
CleanCook MUS_06 MUS Feb-18 0.00 0.00 NaN 17
CleanCook MUS_07 MUS Oct-17 0.73 30.00 40.91 15
CleanCook MUS_07 MUS Nov-17 1.17 32.50 27.86 24
CleanCook MUS_07 MUS Dec-17 1.28 49.31 38.65 29
CleanCook MUS_07 MUS Jan-18 1.76 63.33 35.95 21
CleanCook MUS_07 MUS Feb-18 0.33 6.67 20.00 6
CleanCook MUS_08 MUS Oct-17 0.75 19.00 25.33 20
CleanCook MUS_08 MUS Nov-17 0.00 0.00 NaN 28
CleanCook MUS_08 MUS Dec-17 0.83 17.83 21.58 23
CleanCook MUS_08 MUS Jan-18 1.00 7.50 7.50 4
CleanCook MUS_08 MUS Feb-18 3.00 91.25 30.42 8
CleanCook MUS_09 MUS Oct-17 2.90 75.00 25.86 20
CleanCook MUS_09 MUS Nov-17 1.45 46.36 31.88 22
CleanCook MUS_09 MUS Dec-17 1.45 31.29 21.56 31
CleanCook MUS_09 MUS Jan-18 1.15 38.46 33.33 26
CleanCook MUS_09 MUS Feb-18 1.94 48.75 25.16 16
CleanCook MUS_10 MUS Oct-17 2.25 75.00 33.33 4
CleanCook MUS_10 MUS Nov-17 0.95 27.00 28.42 20
CleanCook MUS_10 MUS Dec-17 0.67 17.62 26.43 21
CleanCook MUS_10 MUS Jan-18 1.44 37.20 25.83 25
CleanCook SHO_01 SHO Oct-17 1.33 39.05 29.29 21
CleanCook SHO_01 SHO Nov-17 0.94 33.89 35.88 18
CleanCook SHO_01 SHO Dec-17 0.77 25.38 33.00 13
CleanCook SHO_01 SHO Jan-18 0.00 0.00 NaN 6
CleanCook SHO_01 SHO Feb-18 0.92 14.17 15.45 12
CleanCook SHO_02 SHO Oct-17 1.29 38.57 30.00 7
CleanCook SHO_02 SHO Nov-17 0.86 26.67 31.11 21
CleanCook SHO_02 SHO Dec-17 1.08 25.38 23.57 13
CleanCook SHO_02 SHO Jan-18 0.53 17.65 33.33 17
CleanCook SHO_02 SHO Feb-18 0.00 0.00 NaN 2
CleanCook SHO_05 SHO Oct-17 0.83 50.00 60.00 6
CleanCook SHO_05 SHO Nov-17 0.77 16.00 20.87 30
CleanCook SHO_05 SHO Dec-17 0.30 7.33 24.44 30
CleanCook SHO_05 SHO Jan-18 2.00 27.14 13.57 7
CleanCook SHO_05 SHO Feb-18 0.67 0.00 0.00 6
CleanCook SHO_06 SHO Oct-17 0.81 14.76 18.24 21
CleanCook SHO_06 SHO Nov-17 0.63 22.22 35.29 27
CleanCook SHO_06 SHO Dec-17 0.25 5.71 22.86 28
CleanCook SHO_06 SHO Jan-18 0.00 0.00 NaN 3
CleanCook SHO_06 SHO Feb-18 0.00 0.00 NaN 13
CleanCook SHO_07 SHO Oct-17 0.71 17.86 25.00 14
CleanCook SHO_07 SHO Nov-17 1.00 30.00 30.00 4
CleanCook SHO_07 SHO Dec-17 0.57 14.29 25.00 7
CleanCook SHO_07 SHO Jan-18 1.67 69.17 41.50 12
CleanCook SHO_07 SHO Feb-18 0.54 12.31 22.86 13
CleanCook SHO_09 SHO Oct-17 0.50 12.73 25.45 22
CleanCook SHO_09 SHO Nov-17 0.00 0.00 NaN 15
CleanCook SHO_09 SHO Dec-17 0.50 7.14 14.29 14
CleanCook SHO_09 SHO Jan-18 0.28 3.89 14.00 18
CleanCook SHO_09 SHO Feb-18 2.00 0.00 0.00 1
Kerosene AKO_01B AKO Jan-18 1.55 82.73 53.53 11
Kerosene AKO_01B AKO Feb-18 1.00 30.00 30.00 2
Kerosene AKO_02B AKO Jan-18 0.00 0.00 NaN 11
Kerosene AKO_02B AKO Feb-18 0.00 0.00 NaN 2
Kerosene AKO_03B AKO Jan-18 1.91 195.45 102.38 11
Kerosene AKO_03B AKO Feb-18 1.50 140.00 93.33 2
Kerosene AKO_04 AKO Oct-17 0.81 39.38 48.46 16
Kerosene AKO_04 AKO Nov-17 0.73 34.55 47.50 11
Kerosene AKO_04 AKO Dec-17 0.57 28.70 50.77 23
Kerosene AKO_04 AKO Jan-18 1.28 63.10 49.46 29
Kerosene AKO_04 AKO Feb-18 0.93 67.14 72.31 14
Kerosene AKO_05B AKO Jan-18 0.36 20.00 55.00 11
Kerosene AKO_05B AKO Feb-18 1.00 30.00 30.00 1
Kerosene AKO_06B AKO Jan-18 1.09 70.00 64.17 11
Kerosene AKO_06B AKO Feb-18 1.00 90.00 90.00 2
Kerosene AKO_09 AKO Oct-17 1.90 135.00 71.05 10
Kerosene AKO_09 AKO Nov-17 0.00 0.00 NaN 11
Kerosene AKO_09 AKO Dec-17 0.64 25.45 40.00 11
Kerosene AKO_09 AKO Jan-18 0.69 63.12 91.82 16
Kerosene AKO_09 AKO Feb-18 0.00 0.00 NaN 2
Kerosene AKO_10 AKO Oct-17 1.83 106.09 58.10 23
Kerosene AKO_10 AKO Nov-17 3.55 225.45 63.59 11
Kerosene AKO_10 AKO Dec-17 2.40 185.33 77.22 30
Kerosene AKO_10 AKO Jan-18 2.33 216.00 92.57 30
Kerosene AKO_10 AKO Feb-18 2.29 153.57 67.19 14
Kerosene MUS_02 MUS Oct-17 2.71 124.29 45.79 21
Kerosene MUS_02 MUS Nov-17 2.41 116.67 48.46 27
Kerosene MUS_02 MUS Dec-17 3.58 276.77 77.30 31
Kerosene MUS_02 MUS Jan-18 3.66 225.86 61.79 29
Kerosene MUS_02 MUS Feb-18 3.00 153.75 51.25 16
Kerosene MUS_03 MUS Oct-17 1.50 119.00 79.33 10
Kerosene MUS_03 MUS Nov-17 0.00 0.00 NaN 10
Kerosene MUS_04 MUS Nov-17 0.33 12.22 36.67 18
Kerosene MUS_04 MUS Dec-17 1.06 70.00 65.76 31
Kerosene MUS_04 MUS Jan-18 0.97 80.00 82.76 30
Kerosene MUS_05 MUS Oct-17 1.75 92.50 52.86 8
Kerosene MUS_05 MUS Nov-17 1.58 70.42 44.47 24
Kerosene MUS_05 MUS Dec-17 2.62 77.50 29.52 8
Kerosene MUS_05 MUS Jan-18 2.42 82.50 34.14 24
Kerosene MUS_05 MUS Feb-18 2.00 80.00 40.00 2
Kerosene MUS_06 MUS Oct-17 4.40 454.00 103.18 10
Kerosene MUS_06 MUS Nov-17 4.00 435.71 108.93 28
Kerosene MUS_06 MUS Dec-17 1.96 200.74 102.26 27
Kerosene MUS_06 MUS Jan-18 0.67 71.11 106.67 27
Kerosene MUS_06 MUS Feb-18 0.71 50.00 70.00 14
Kerosene MUS_07 MUS Oct-17 0.78 37.78 48.57 18
Kerosene MUS_07 MUS Nov-17 1.80 120.00 66.67 5
Kerosene MUS_07 MUS Dec-17 0.25 11.50 46.00 20
Kerosene MUS_07 MUS Jan-18 0.80 28.00 35.00 5
Kerosene MUS_07 MUS Feb-18 0.77 40.00 52.00 13
Kerosene MUS_08 MUS Nov-17 2.08 120.83 58.00 12
Kerosene MUS_08 MUS Dec-17 1.79 72.63 40.59 19
Kerosene MUS_08 MUS Jan-18 2.40 165.60 69.00 25
Kerosene MUS_08 MUS Feb-18 1.69 110.62 65.56 16
Kerosene MUS_09 MUS Oct-17 1.22 3.48 2.86 23
Kerosene MUS_09 MUS Nov-17 0.96 9.64 10.00 28
Kerosene MUS_09 MUS Dec-17 1.67 98.67 59.20 30
Kerosene MUS_09 MUS Jan-18 1.38 111.38 80.75 29
Kerosene MUS_09 MUS Feb-18 1.00 137.69 137.69 13
Kerosene MUS_10 MUS Oct-17 2.67 153.33 57.50 3
Kerosene MUS_10 MUS Nov-17 2.80 211.67 75.60 30
Kerosene MUS_10 MUS Dec-17 2.40 164.67 68.61 30
Kerosene MUS_10 MUS Jan-18 2.41 162.22 67.38 27
Kerosene MUS_10 MUS Feb-18 4.00 230.00 57.50 2
Kerosene SHO_01 SHO Oct-17 2.11 144.44 68.42 18
Kerosene SHO_01 SHO Nov-17 1.69 80.62 47.78 16
Kerosene SHO_01 SHO Dec-17 1.88 135.00 72.00 16
Kerosene SHO_01 SHO Jan-18 2.05 127.37 62.05 19
Kerosene SHO_01 SHO Feb-18 1.47 70.00 47.73 15
Kerosene SHO_02 SHO Oct-17 2.25 183.75 81.67 8
Kerosene SHO_02 SHO Nov-17 2.57 225.71 87.78 21
Kerosene SHO_02 SHO Dec-17 1.67 165.00 99.00 12
Kerosene SHO_02 SHO Jan-18 1.47 130.53 88.57 19
Kerosene SHO_02 SHO Feb-18 0.80 87.33 109.17 15
Kerosene SHO_05 SHO Oct-17 4.00 235.00 58.75 2
Kerosene SHO_05 SHO Nov-17 4.83 345.22 71.53 23
Kerosene SHO_05 SHO Dec-17 2.17 136.67 63.08 6
Kerosene SHO_05 SHO Jan-18 2.25 240.00 106.67 8
Kerosene SHO_05 SHO Feb-18 4.14 212.86 51.38 7
Kerosene SHO_06 SHO Oct-17 0.00 0.00 NaN 24
Kerosene SHO_06 SHO Nov-17 0.00 0.00 NaN 28
Kerosene SHO_06 SHO Dec-17 0.00 0.00 NaN 19
Kerosene SHO_06 SHO Feb-18 0.00 0.00 NaN 13
Kerosene SHO_07 SHO Oct-17 0.57 34.29 60.00 7
Kerosene SHO_07 SHO Nov-17 0.00 0.00 NaN 5
Kerosene SHO_07 SHO Dec-17 0.00 0.00 NaN 20
Kerosene SHO_09 SHO Oct-17 1.90 103.50 54.47 20
Kerosene SHO_09 SHO Nov-17 0.80 25.33 31.67 15
Kerosene SHO_09 SHO Dec-17 0.67 30.00 45.00 3
Kerosene SHO_09 SHO Jan-18 1.53 98.95 64.83 19
Kerosene SHO_09 SHO Feb-18 0.50 15.00 30.00 2
LPG AKO_04 AKO Oct-17 0.90 31.50 35.00 20
LPG AKO_04 AKO Nov-17 1.00 20.00 20.00 1
LPG AKO_04 AKO Dec-17 1.73 48.67 28.08 30
LPG AKO_04 AKO Jan-18 1.00 26.07 26.07 28
LPG AKO_04 AKO Feb-18 0.50 5.71 11.43 14
LPG SHO_01 SHO Nov-17 0.43 12.86 30.00 14
LPG SHO_01 SHO Dec-17 1.86 35.71 19.23 7
LPG SHO_01 SHO Jan-18 0.00 0.00 NaN 19
LPG SHO_01 SHO Feb-18 0.00 0.00 NaN 15
LPG SHO_02 SHO Nov-17 0.00 0.00 NaN 9
LPG SHO_05 SHO Nov-17 0.64 6.82 10.71 22
LPG SHO_05 SHO Dec-17 0.07 2.00 30.00 30
LPG SHO_05 SHO Jan-18 2.00 68.75 34.38 8
LPG SHO_05 SHO Feb-18 1.43 72.86 51.00 7
LPG SHO_06 SHO Oct-17 1.33 59.33 44.50 15
LPG SHO_06 SHO Nov-17 1.11 54.64 49.35 28
LPG SHO_06 SHO Dec-17 1.39 46.67 33.60 18
LPG SHO_06 SHO Feb-18 0.00 0.00 NaN 13
# Roll the household-level monthly statistics up to stove type x group x month:
# one summary row per (stove_descriptions, group, month_year) combination.
stats_by_stove_group_month <- stats_grouped_by_hhid_stove_month %>%
  dplyr::group_by(stove_descriptions, group, month_year) %>%
  dplyr::summarise(
    mean_events_per_day = mean(events_per_day, na.rm = TRUE),
    median_events_per_day = median(events_per_day, na.rm = TRUE),
    sd_events_per_day = sd(events_per_day, na.rm = TRUE),
    mean_minutes_per_day_stove_hhid = mean(minutes_per_day_stove_hhid, na.rm = TRUE),
    median_minutes_per_day_stove_hhid = median(minutes_per_day_stove_hhid, na.rm = TRUE),
    sd_minutes_per_day_stove_hhid = sd(minutes_per_day_stove_hhid, na.rm = TRUE),
    households = n(),
    mean_days_logged = mean(days_logged_per_group_by_stove, na.rm = TRUE)
  )

# Render the monthly summary table with two decimal places.
kable(stats_by_stove_group_month, digits = 2)
stove_descriptions group month_year mean_events_per_day median_events_per_day sd_events_per_day mean_minutes_per_day_stove_hhid median_minutes_per_day_stove_hhid sd_minutes_per_day_stove_hhid households mean_days_logged
CleanCook AKO Oct-17 1.59 1.59 1.36 48.60 48.60 53.84 2 14.00
CleanCook AKO Nov-17 0.00 0.00 0.00 0.00 0.00 0.00 3 4.67
CleanCook AKO Dec-17 0.57 0.68 0.53 19.59 21.92 18.53 3 22.33
CleanCook AKO Jan-18 0.95 0.67 1.19 27.58 35.00 29.26 7 13.14
CleanCook AKO Feb-18 1.31 1.00 1.36 27.99 22.50 29.74 8 6.25
CleanCook MUS Oct-17 1.73 1.27 1.16 74.82 32.73 109.80 9 12.67
CleanCook MUS Nov-17 0.81 0.94 0.53 20.20 18.75 16.40 9 21.44
CleanCook MUS Dec-17 0.66 0.67 0.61 19.51 17.62 19.28 9 23.11
CleanCook MUS Jan-18 0.67 0.50 0.75 18.31 3.75 24.63 8 22.38
CleanCook MUS Feb-18 0.88 0.17 1.28 24.44 3.33 37.85 6 9.17
CleanCook SHO Oct-17 0.91 0.82 0.33 28.83 28.21 15.65 6 15.17
CleanCook SHO Nov-17 0.70 0.81 0.37 21.46 24.44 12.21 6 19.17
CleanCook SHO Dec-17 0.58 0.54 0.31 14.21 10.81 9.16 6 17.50
CleanCook SHO Jan-18 0.75 0.40 0.87 19.64 10.77 26.58 6 10.50
CleanCook SHO Feb-18 0.69 0.60 0.74 4.41 0.00 6.86 6 7.83
Kerosene AKO Oct-17 1.51 1.83 0.61 93.49 106.09 49.04 3 16.33
Kerosene AKO Nov-17 1.42 0.73 1.87 86.67 34.55 121.43 3 11.00
Kerosene AKO Dec-17 1.20 0.64 1.04 79.83 28.70 91.38 3 21.33
Kerosene AKO Jan-18 1.15 1.18 0.78 88.80 66.56 77.35 8 16.25
Kerosene AKO Feb-18 0.96 1.00 0.75 63.84 48.57 59.75 8 4.88
Kerosene MUS Oct-17 2.15 1.75 1.22 140.63 119.00 147.65 7 13.29
Kerosene MUS Nov-17 1.77 1.80 1.25 121.91 116.67 136.34 9 20.22
Kerosene MUS Dec-17 1.92 1.88 1.01 121.56 88.08 86.06 8 24.50
Kerosene MUS Jan-18 1.84 1.89 1.05 115.83 96.94 64.21 8 24.50
Kerosene MUS Feb-18 1.88 1.69 1.24 114.58 110.62 66.25 7 10.86
Kerosene SHO Oct-17 1.81 2.01 1.41 116.83 123.97 89.27 6 13.17
Kerosene SHO Nov-17 1.65 1.24 1.85 112.82 52.98 142.06 6 18.00
Kerosene SHO Dec-17 1.06 1.17 0.97 77.78 82.50 75.80 6 12.67
Kerosene SHO Jan-18 1.83 1.79 0.39 149.21 128.95 62.17 4 16.25
Kerosene SHO Feb-18 1.38 0.80 1.63 77.04 70.00 84.24 5 10.40
LPG AKO Oct-17 0.90 0.90 NA 31.50 31.50 NA 1 20.00
LPG AKO Nov-17 1.00 1.00 NA 20.00 20.00 NA 1 1.00
LPG AKO Dec-17 1.73 1.73 NA 48.67 48.67 NA 1 30.00
LPG AKO Jan-18 1.00 1.00 NA 26.07 26.07 NA 1 28.00
LPG AKO Feb-18 0.50 0.50 NA 5.71 5.71 NA 1 14.00
LPG SHO Oct-17 1.33 1.33 NA 59.33 59.33 NA 1 15.00
LPG SHO Nov-17 0.54 0.53 0.46 18.58 9.84 24.61 4 18.25
LPG SHO Dec-17 1.10 1.39 0.93 28.13 35.71 23.28 3 18.33
LPG SHO Jan-18 1.00 1.00 1.41 34.38 34.38 48.61 2 13.50
LPG SHO Feb-18 0.48 0.00 0.82 24.29 0.00 42.06 3 11.67
# Collect each summary table under the worksheet name it should appear as,
# then write all of them to a single date-stamped Excel workbook.
list_of_datasets <- list(
  "By Stove Group and HHID" = stats_grouped_by_hhid_stove,
  "By Stove Group" = stats_by_stove_group,
  "By Stove Type" = stats_by_stove,
  "By HHID and Monthly" = stats_grouped_by_hhid_stove_month,
  "By Group and Monthly" = stats_by_stove_group_month
)
summary_workbook_name <- paste0("Summary Statistics ", format(now(), "%y-%m-%d"), ".xlsx")
openxlsx::write.xlsx(list_of_datasets, file = summary_workbook_name)

Plot all data time series

  • Plot usage rates (uses/day) by stove type, and region
#Plot time series for each household, colored by filename.  For data completion purposes.
# field_timeseries_plot() is a project helper sourced from ../r_scripts/plots.R;
# arguments appear to be (data, y column, x column, facet id, grouping column,
# qc flag column) -- confirm against plots.R.
field_timeseries_plot(sumsarized_filtered, "stove_temp", "datetime", "HHID", "stove_descriptions","qc") 

Plot data time series, removing flagged files and households

  • Plot usage rates (uses/day) by stove type, and region
#Plot time series for each household, colored by filename.  Filtered out bad qc data
# Same helper as above, but restricted to rows whose qc string contains "ok",
# i.e. records that passed quality control.
field_timeseries_plot(dplyr::filter(sumsarized_filtered,grepl("ok",qc)),"stove_temp", "datetime", "HHID", "stove_descriptions","qc") 

List of files without associated HHID and metadata from the tracking sheet.

# Files that could not be matched to a household ID. Such rows are dropped in
# the next step, so anything listed here represents data that will be excluded.
files_without_hhid <- sumsarized_filtered %>%
  dplyr::filter(is.na(HHID)) %>%
  dplyr::select(fullsumsarizer_filename) %>%
  unique()
files_without_hhid
## # A tibble: 0 x 1
## # ... with 1 variables: fullsumsarizer_filename <chr>
# Files with no matching entry in the tracking sheet, and therefore no stove
# metadata. These are not currently filtered out; instead, fix the data files
# or tracking-sheet entries that appear here so actual file names match the
# tracking sheet exactly. File names should also be unique.
files_without_metadata <- sumsarized_filtered %>%
  dplyr::filter(is.na(stove_descriptions)) %>%
  dplyr::select(fullsumsarizer_filename) %>%
  unique()
files_without_metadata
## # A tibble: 0 x 1
## # ... with 1 variables: fullsumsarizer_filename <chr>

Boxplot usage results

# Helper for stat_summary(geom = "text"): labels each group with its size (n),
# positioned at y = 0 beneath the boxes.
give.n <- function(x) {
  c(y = 0, label = length(x))
}

# Summary function for stat_summary(geom = "boxplot"): draws whiskers at the
# 5th/95th percentiles and the box at the 25th/50th/75th percentiles, instead
# of ggplot2's default Tukey IQR-based whiskers.
f <- function(x) {
  probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
  setNames(quantile(x, probs = probs),
           c("ymin", "lower", "middle", "upper", "ymax"))
}


# Average uses per day for each household-stove combination, by group, from the
# padded data set (one point per household-stove combo). Box = 25/50/75th
# percentiles with 5th/95th whiskers (f()); give.n() prints n at y = 0.
g1 <- ok_cooking_events_padded %>%
  dplyr::group_by(stove_descriptions, HHID, group) %>%
  dplyr::summarise(avg_events_per_day = sum(use_flag, na.rm = TRUE) /
                     length(unique(day_month_year))) %>%
  ggplot(aes(x = stove_descriptions, y = avg_events_per_day)) +
  stat_summary(fun.data = f, geom = "boxplot") +
  geom_jitter(height = 0, width = 0.2, alpha = 0.25) +
  #facet_grid(stove_use_category~group,scales = "free", space = "free") + 
  stat_summary(fun.data = give.n, geom = "text") +
  facet_grid(~group, scales = "free", space = "free") +
  labs(y = "Average uses per day", x = "") +
  ggtitle("Average uses per day for each household-stove combination") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.title = element_blank())
g1

#ggsave(filename="../figures/AverageUsesPerDay_byGrouppadded.png", plot=g1,width = 6, height = 4)


#Plot uses per day, by group, using the padded data set. (one data point per day-household-stove combo)
#Histogram of daily use counts, faceted by group x stove type.
#NOTE(review): the name g11 is reused later in this script for the overall
#usage-fraction plot; consider renaming one of the two to avoid clobbering.
g11<- dplyr::group_by(ok_cooking_events_padded, stove_descriptions, day_month_year, HHID, group) %>%
  # Qualified as dplyr::summarise for consistency with the rest of this script
  # (was a bare summarise(), which can be masked by other attached packages).
  dplyr::summarise(events_per_day=sum(use_flag,na.rm=TRUE)) %>%
  ggplot(aes(events_per_day)) +
  geom_histogram() +
  #stat_summary(fun.data = f, geom="boxplot") +  
  #geom_jitter(height = 0,width = 0.2,alpha = 0.25) +
  #facet_grid(stove_use_category~group,scales = "free", space = "free") + 
  #stat_summary(fun.data = give.n, geom = "text") + 
  facet_grid(group~stove_descriptions,scales = "free", space = "free") + 
  labs(y="days",x="Number of uses by day") + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) + 
  theme(legend.title = element_blank())
g11

##ggsave(filename="../figures/UsesPerDayDistribution_byGrouppadded.png", plot=g11,width = 6, height = 4)


#Plot average uses per day for all groups together
# Same metric as g1 (uses/day per household-stove combo from the padded set),
# but pooled across groups: grouping is by stove and household only.
# Box = 5/25/50/75/95th percentiles via f(); give.n() prints n at y = 0.
g2 <- dplyr::group_by(ok_cooking_events_padded,stove_descriptions,HHID) %>%
  dplyr::summarise(avg_events_per_day=sum(use_flag,na.rm=TRUE)/length(unique(day_month_year))) %>%
  ggplot(aes(x=stove_descriptions, y = avg_events_per_day)) +
  stat_summary(fun.data = f, geom="boxplot") +    geom_jitter(height = 0,width = 0.2,alpha = 0.25) + 
  labs(y="Average uses/day",x="") + 
  stat_summary(fun.data = give.n, geom = "text") + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) 
g2

#ggsave(filename="../figures/AverageUsesPerDay_All.png", plot=g2,width = 6, height = 4)

#Plot average minutes used per day for each stove-household combination
#(padded data set; one point per household-stove combo), faceted by group.
g3<-  dplyr::group_by(ok_cooking_events_padded,stove_descriptions,HHID,group) %>%
  # Renamed local column from avg_events_per_day: this value is total minutes
  # divided by days logged, i.e. minutes/day, not an event count.
  dplyr::summarise(avg_minutes_per_day=sum(duration_minutes,na.rm=TRUE)/length(unique(day_month_year))) %>%
  ggplot(aes(x=stove_descriptions, y = avg_minutes_per_day)) +
  stat_summary(fun.data = f, geom="boxplot") +   geom_jitter(height = 0,width = 0.2,alpha = 0.25) +
  facet_grid(~group,scales = "free", space = "free") + 
  labs(y="Average minutes used per day",x="") + 
  stat_summary(fun.data = give.n, geom = "text") + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
g3

#ggsave(filename="../figures/AverageTimeUsedPerDay_All.png", plot=g3,width = 6, height = 4)

#Average event duration: total minutes used divided by number of uses, per
#household-stove combination (padded data set).
g4<-  dplyr::group_by(ok_cooking_events_padded,stove_descriptions,HHID,group) %>%
  # Renamed local column from minutes_per_day_stove_hhid: this ratio is minutes
  # per EVENT, not per day. Households with zero recorded uses yield NaN (0/0),
  # which ggplot drops with a warning.
  dplyr::summarise(avg_minutes_per_event=sum(duration_minutes,na.rm=TRUE)/sum(use_flag,na.rm=TRUE)) %>%  
  ggplot(aes(x=stove_descriptions, y = avg_minutes_per_event)) + 
  stat_summary(fun.data = f, geom="boxplot") + 
  geom_jitter(height = 0,width = 0.2,alpha = 0.25) +
  stat_summary(fun.data = give.n, geom = "text") + 
  labs(y="Average event duration by household (minutes)",x="") + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
g4

#ggsave(filename="../figures/AverageTimeUsedPerEvent_byHousehold.png", plot=g4,width = 6, height = 4)

Time trend plots

#Plot event duration per day by day of week.  Do not use the padded data set here, since there are non-events included in it, and they have an associated time.
#Fix: geom_violin() has no fun.data argument (that belongs to stat_summary);
#ggplot2 ignored it with an "unknown parameters" warning, so removing it does
#not change the rendered plot.
g5 <- ggplot(ok_cooking_events, aes(x=day_of_week, y = duration_minutes)) + 
    geom_violin() +    geom_jitter(height = 0,width = 0.2,alpha = 0.1) +
    facet_wrap(stove_descriptions~group,scales = "free") + 
    labs(y="Cooking event duration (minutes)",x="") + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
g5

#ggsave(g5,filename=paste("../figures/DurationbyDayofWeek",y,".png",sep=""))

#Monthly usage summary
# Boxplots of events/day per household-month, dodged by stove type and faceted
# by stove x group. Box = 5/25/50/75/95th percentiles via f(); jittered points
# are individual household-months.

monthlyboxplots <-    ggplot(stats_grouped_by_hhid_stove_month,aes(x=month_year, y = events_per_day,color = stove_descriptions)) +
    stat_summary(fun.data = f, geom="boxplot",position = position_dodge(width=0.75)) +
    geom_jitter(alpha = 0.25,position = position_dodge(width=0.75)) +
    labs(y="Events/day",x="") + 
    facet_wrap(stove_descriptions~group,scales = "free") + 
    ggtitle("Monthly trends") + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1),legend.title = element_blank())
monthlyboxplots

#NOTE(review): the commented ggsave below interpolates a variable y that is not
#defined in this section -- confirm its source before re-enabling.
#ggsave(monthlyboxplots,filename=paste("../figures/monthlyboxplots",y,".png",sep=""),width = 7, height = 2)


#Plot %stove-days, where stove-day is defined as whether the stove is used on the given day or not.
  #Calculate percent stove-days.
# Order matters here: distinct() must run before mutate() so each household
# contributes at most one row per stove-day to the percentage.
g7 <-dplyr::group_by(ok_cooking_events_padded,day_month_year,stove_descriptions,group) %>%
      dplyr::distinct(HHID,stove_descriptions,day_month_year, .keep_all = TRUE) %>% #No duplicate events from the same hh contributing to stove-days.
      # Percent of households using the stove that day: used / (used + not used),
      # counting only rows with a non-NA use_flag.
      dplyr::mutate(days_used_day_stove_grouped = 100*sum(use_flag, na.rm=TRUE)/(sum(use_flag, na.rm=TRUE)+sum(!use_flag, na.rm=TRUE))) %>%
      ggplot(aes(x=day_month_year, y = days_used_day_stove_grouped,color = stove_descriptions)) + 
    geom_point(alpha = 0.1) + geom_smooth() + #method='lm',formula=y~x) +
    facet_grid(~group,scales = "free", space = "free") + 
    labs(y="% stove days",x="") + 
    theme(legend.title = element_blank()) + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  g7

          #ggsave(g7,filename=paste("../figures/PercentStoveDaysByGroup.png",sep=""))


#Time of day trend.  Based on start time, change it to middle of event?  Remove groups with less than 5 hh's?
#Use unpadded set to get time of day of event starts.
# Kernel density of event start hour, one curve per stove type, faceted by
# group. ylim caps density at 0.15; densities above that would be clipped.
g8 <- ggplot(ok_cooking_events, aes(start_hour, fill = stove_descriptions, 
                      colour = stove_descriptions)) + geom_density(alpha = 0.1) +
      labs(y="Time of day use density",x="Hour of day") +
      ggtitle("stove use density") + 
      facet_grid(~group,scales = "free", space = "free") + 
      theme(plot.title = element_text(hjust = 0.5)) +
      ylim(0, 0.15)
g8

#NOTE(review): the commented ggsave below references an undefined variable y.
#ggsave(g8,filename=paste("../figures/Density",y,".png",sep=""))


#Usage time series by minutes used each day, by group and stove type  
# Daily mean event duration per group/stove, as a scatter with a smoothed
# trend. NOTE(review): ylim(0, 200) clips any daily mean above 200 minutes, and
# the title says "uses" while the y axis is minutes -- confirm intent.
 g12<- dplyr::group_by(ok_cooking_events_padded,stove_descriptions,day_month_year,group) %>%
  dplyr::summarise(mean_daily_use_tseries=mean(duration_minutes,na.rm=TRUE)) %>%
  ggplot(aes(day_month_year, y=mean_daily_use_tseries,color = stove_descriptions)) + #,label=sprintf("%0.2f", round(usefraction, digits = 2)))) +
  geom_smooth()+
  geom_point(alpha = 0.25)+
  facet_grid(~group~stove_descriptions,scales = "free", space = "free") + 
  ggtitle("Daily average uses by group and stove") + 
  labs(y="Minutes per day",x="") +
  ylim(0,200)
 g12

#NOTE(review): filename below says "usage_fraction" but saves g12 -- likely a
#copy-paste from the usage-fraction chunk; verify before re-enabling.
#ggsave(filename="../figures/Overall_usage_fraction.png", plot=g12,width = 6, height = 4)

Usage fraction result plots

#Daily usage fraction by group. Includes all hh.  We could filter out houses with fewer than x sums to reduce any potential bias from a certain sum type burning up or something.
# Stacked (position = "fill") daily bars of mean event duration per stove type,
# so each bar shows the share of total daily use attributable to each stove.
g10<- dplyr::select(ok_cooking_events_padded,stove_descriptions,start_time,duration_minutes) %>%
  # thicken() presumably comes from the padr package (adds a 'day' column
  # derived from start_time) -- confirm against load.R.
  thicken('day', col = 'day') %>% 
  group_by(stove_descriptions, day) %>%
  summarise(avg=mean(duration_minutes,na.rm=TRUE)) %>%
  ggplot(aes(day, avg)) +
  geom_bar(aes(fill = stove_descriptions), 
           col = "black",
           stat = 'identity', 
           position = "fill",size=0) +
  ggtitle("Usage fraction") + 
  labs(y="Usage fraction",x="")  +
  theme(legend.title = element_blank())
g10

#Overall usage fraction by group.  
# Single stacked bar: each stove type's mean event duration as a fraction of
# the sum of means across stove types, with the fraction printed in each slice.
# NOTE(review): this reassigns g11, clobbering the uses-per-day histogram plot
# defined earlier under the same name -- consider renaming one of them.
 g11<- dplyr::select(ok_cooking_events_padded,stove_descriptions,day_month_year,duration_minutes) %>%
  group_by(stove_descriptions) %>%
  summarise(avg=mean(duration_minutes,na.rm=TRUE)) %>%
  dplyr::mutate(usefraction = avg/sum(avg)) %>%
  ggplot(aes(x=1, usefraction,label=sprintf("%0.2f", round(usefraction, digits = 2)))) +
  geom_bar(aes(fill = stove_descriptions), 
           col = "black",
           stat = 'identity', 
           position = "fill",size=0) +
  geom_text(size = 5, position = position_stack(vjust = 0.5))+
  ggtitle("Overall usage fraction") + 
  labs(y="Usage fraction",x="")  +
  theme(legend.title = element_blank())
g11

#NOTE(review): the commented ggsave below saves g12, not g11 -- verify.
#ggsave(filename="../figures/Overall_usage_fraction.png", plot=g12,width = 6, height = 4)


#Overall usage fraction by group.  
# Same usage-fraction bar as the previous chunk, but computed and faceted per
# group: within each group, each stove's share of the summed mean durations.
 g111<- dplyr::select(ok_cooking_events_padded,stove_descriptions,day_month_year,duration_minutes,group) %>%
  group_by(stove_descriptions,group) %>%
  summarise(avg=mean(duration_minutes,na.rm=TRUE)) %>%
  # After summarise, data is grouped by stove_descriptions, so sum(avg) here is
  # taken within each stove across groups -- confirm this is the intended
  # denominator (vs. within each group across stoves).
  dplyr::mutate(usefraction = avg/sum(avg)) %>%
  ggplot(aes(x=1, usefraction,label=sprintf("%0.2f", round(usefraction, digits = 2)))) +
  geom_bar(aes(fill = stove_descriptions), 
           col = "black",
           stat = 'identity', 
           position = "fill",size=0) +
  geom_text(size = 5, position = position_stack(vjust = 0.5))+
  ggtitle("Overall usage fraction") + 
  facet_grid(~group,scales = "free", space = "free") + 
  labs(y="Usage fraction",x="")  +
  theme(legend.title = element_blank())
g111

#ggsave(filename="../figures/Overall_usage_fraction_bygroup.png", plot=g111,width = 6, height = 4)

Plot deployment details

 #Barplot by day of fraction of stoves monitored... not too interesting but neat coding to look at.
g9<- dplyr::select(ok_cooking_events_padded,stove_descriptions,start_time,duration_minutes) %>%
  thicken('day', col = 'day') %>% #Map data to a higher level variable.  Compresses two data points from one day into one day
  count(stove_descriptions, day) %>%
  ggplot(aes(day, n)) +
  ggtitle("Daily stove monitored by fraction") + 
  labs(y="Monitored fraction",x="")  +
  geom_bar(aes(fill = stove_descriptions), 
           col = "black",
           stat = 'identity', 
           position = "fill", size=0)
g9

# Days of logging per data file: reduce to one row per distinct
# (file, duration, stove) combination, then box-plot durations by stove type.
g6 <- ok_cooking_events_padded %>%
  dplyr::distinct(fullsumsarizer_filename, logging_duration_days, stove_descriptions) %>%
  ggplot(aes(x = stove_descriptions, y = logging_duration_days)) +
  stat_summary(fun.data = f, geom = "boxplot") +
  geom_jitter(width = 0.05, alpha = 0.25) +
  #geom_text_repel( aes(label= ifelse(logging_duration_days > 30,as.character(fullsumsarizer_filename),''))) + 
  #facet_grid(~group,scales = "free", space = "free") + 
  labs(y = "Days logged per file", x = "") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
g6

#ggsave(filename="../figures/DaysSampled_byGroup.pdf", plot=g6)

Summary

Temperature was measured for 23 houses between 2017-10-06 00:01:00 and 2018-02-21 21:15:00. There are no cooking events for homes: MUS_01B, SHO_03B, SHO_04B, SHO_08B, SHO_10B, AKO_07B, AKO_08B.

Temperature data is expected to be missing for: no tests.

Save files

  • save data
  # Persist the final cooking-events table for reuse by downstream scripts
  # without re-running the import and filtering steps above.
  saveRDS(ok_cooking_events, file = "../r_files/ok_cooking_events.RDS")